
# generate a dictionary, apply it to the corpus, merge it with covariates 

# load packages
library(rio) # load data
library(tidyverse) # data manipulation
library(quanteda) # text analysis


# all relative paths below ("data/...") are resolved against this directory
setwd(dir = "~/replication_files/")

# restore the saved consultation data; the rest of the script expects this file
# to provide the data frame `article_iv_corpus`
load(file = "data/article_iv_corpus.Rdata")

# read the covariates (these include variables that were coded manually from the
# corpus above, e.g. whether a consultation promotes natural resource governance)
covariates <- import(file = "data/covariates.csv")

# number of words per consultation, counted on the original text
# (i.e. before the phrase replacements below shorten it)
article_iv_corpus$n_words <- str_count(article_iv_corpus$staff_appraisal, '\\w+')

# collapse multi-word names into single words so that they are tokenised as one unit:
#   "International Monetary Fund" -> "imf", "World Bank" -> "WorldBank"
# and make sure that "non-oil" is treated as a word of its own ("nonoil").
# str_replace_all() is required here: str_replace() only rewrites the FIRST match
# in each text and would leave every later occurrence untouched.
# The named vector applies the replacements one after another to each text.
# NOTE(review): the original comment says these names "should be removed", but no
# removal step is visible in this script - confirm whether one was intended.
article_iv_corpus$staff_appraisal <- str_replace_all(
  article_iv_corpus$staff_appraisal,
  c(
    "International Monetary Fund" = "imf",
    "World Bank" = "WorldBank",
    "non-oil" = "nonoil"
  )
)

# build a document identifier of the form "<iso3c> <year>"; iso3c and year are
# kept as separate columns (remove = FALSE) because they are needed for the merge later
article_iv_corpus <- article_iv_corpus %>%
  unite("doc_id", c("iso3c", "year"), sep = " ", remove = FALSE)

# regenerate corpus, including these changes; the remaining columns travel along
# as document variables
articleiv_corpus <- corpus(article_iv_corpus, text_field = "staff_appraisal")

# pre-processing: tokenise the corpus and lower-case every token
articleiv_tokens <- tokens_tolower(tokens(articleiv_corpus))

# search terms: a dictionary with the single key "resources";
# multi-word entries are matched as token sequences
natural_resource_dictionary <- dictionary(list(
  resources = c(
    # generic terms
    "natural resource", "natural resources",
    "extractive industry", "extractive industries",
    # oil, gas and energy
    "oil", "petroleum", "crude oil", "gasoline", "diesel", "LNG", "natural gas",
    "fuel", "fuels", "energy", "refinery", "hydrocarbon",
    # mining and minerals
    "mineral", "mining", "mine", "mines", "copper", "gold", "diamond",
    "iron", "steel", "phosphate",
    # transparency initiative
    "eiti"
  )
))

# run the search: every dictionary match in the tokens is replaced by its key
find_tokens <- tokens_lookup(
  articleiv_tokens,
  dictionary = natural_resource_dictionary,
  valuetype = "glob",
  verbose = TRUE
)

### absolute values (used in the main analysis)
# document-feature matrix of the dictionary hits: one row per consultation,
# one column ("resources") holding the number of matches
search_hits_abs <- dfm(find_tokens)

# turn the dfm into a data frame and give the count column a descriptive name
search_results_abs <- rename(
  quanteda::convert(search_hits_abs, to = "data.frame"),
  resource_mentions_absolute = resources
)

### weighted values (used as a robustness check - see appendix F2)
# tf-idf weighting: a term's weight in a document grows with how often the term
# occurs in that document, and is discounted by the number of documents in the
# corpus that contain the term (inverse document frequency)
search_hits_tfidf <- dfm_tfidf(search_hits_abs)

# same conversion as above, now for the weighted matrix
search_results_tfidf <- rename(
  quanteda::convert(search_hits_tfidf, to = "data.frame"),
  resource_mentions_tfidf = resources
)


# generate one single dataset: absolute values, weighted values, and covariates
data_with_dictionary <- search_results_abs %>%
  # both search-result tables and the corpus share the document identifier
  full_join(search_results_tfidf, by = "doc_id") %>%
  full_join(article_iv_corpus, by = "doc_id") %>%
  # drop the raw text and identifiers that are not needed in the analysis data
  dplyr::select(-c(doc_id, country, date_pub, title, staff_appraisal)) %>%
  # keep every row of the covariates
  # NOTE(review): join keys are left implicit (all shared column names) because
  # the covariates schema is not visible here - presumably iso3c and year; confirm
  right_join(covariates) %>%
  # resource mentions should be zero in country-years without consultations
  mutate(across(
    c(resource_mentions_absolute, resource_mentions_tfidf),
    ~ replace_na(.x, 0)
  )) %>%
  group_by(iso3c) %>%
  arrange(year, .by_group = TRUE) %>%
  # all lags below are taken within country, on rows ordered by year
  # NOTE(review): lag() shifts by one row, so this equals "previous year" only if
  # each country has one row per year without gaps - confirm against covariates
  mutate(
    # running index of the observation within each country
    time = row_number(),
    # lag dummy indicating whether the consultation promotes natural resource
    # governance (see appendix E3)
    article_iv_promotes_governance_lag =
      dplyr::lag(article_iv_promotes_governance, n = 1L, default = 0),
    # dummy: does the consultation mention natural resources at all?
    article_iv_mentions_resources = ifelse(resource_mentions_absolute > 0, 1, 0),
    # lag this dummy
    article_iv_mentions_resources_lag =
      dplyr::lag(article_iv_mentions_resources, n = 1L, default = 0),
    # cumulative number of earlier consultations that mention natural resources
    cumulative_mentions = cumsum(article_iv_mentions_resources_lag),
    # does any previous consultation mention natural resources?
    previous_article_iv_mentions_resources = ifelse(cumulative_mentions > 0, 1, 0),
    # lag number of absolute mentions to avoid simultaneity bias
    resource_mentions_absolute_lag =
      dplyr::lag(resource_mentions_absolute, n = 1L, default = 0),
    # lag number of relative mentions to avoid simultaneity bias
    resource_mentions_tfidf_lag =
      dplyr::lag(resource_mentions_tfidf, n = 1L, default = 0)
  ) %>%
  ungroup() %>%
  dplyr::select(
    iso3c, year, policy_passage, previous_policy, article_iv_publication, n_words,
    article_iv_promotes_governance, article_iv_promotes_governance_lag,
    article_iv_mentions_resources, article_iv_mentions_resources_lag,
    resource_mentions_absolute, resource_mentions_absolute_lag,
    resource_mentions_tfidf, resource_mentions_tfidf_lag,
    previous_article_iv_mentions_resources,
    imf_program:time
  ) %>%
  # remove countries for which no consultations are available and make sure the
  # analysis begins in 2004; this comes after the lags so that earlier years can
  # still feed the lagged and cumulative variables
  filter(
    iso3c != "ERI",
    iso3c != "TKM",
    iso3c != "VEN",
    year > 2003
  )

# write the merged analysis dataset to disk
export(data_with_dictionary, "data/full_data.csv")

